library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
# Dataset COVID --> https://www.kaggle.com/datasets/sudalairajkumar/covid19-in-usa 

" Context
Data is obtained from COVID-19 Tracking project and NYTimes. Sincere thanks to them for making it available to the public.

Coronaviruses are a large family of viruses which may cause illness in animals or humans. In humans, several coronaviruses are known to cause respiratory infections ranging from the common cold to more severe diseases such as Middle East Respiratory Syndrome (MERS) and Severe Acute Respiratory Syndrome (SARS). The most recently discovered coronavirus causes coronavirus disease COVID-19 - World Health Organization

The number of new cases are increasing day by day around the world. This dataset has information from 50 US states and the District of Columbia at daily level."
## [1] " Context\nData is obtained from COVID-19 Tracking project and NYTimes. Sincere thanks to them for making it available to the public.\n\nCoronaviruses are a large family of viruses which may cause illness in animals or humans. In humans, several coronaviruses are known to cause respiratory infections ranging from the common cold to more severe diseases such as Middle East Respiratory Syndrome (MERS) and Severe Acute Respiratory Syndrome (SARS). The most recently discovered coronavirus causes coronavirus disease COVID-19 - World Health Organization\n\nThe number of new cases are increasing day by day around the world. This dataset has information from 50 US states and the District of Columbia at daily level."
# Point the session at the project folder, then read the county-level daily
# COVID-19 records from the CSV stored there.
setwd("C:/Users/saksh/OneDrive/Desktop/R progamme/DA-Theory")
dataset <- read.csv(file = "us_counties_covid19_daily.csv")

Data Exploration

# Column types and a preview of the first values.
str(dataset)
## 'data.frame':    800437 obs. of  6 variables:
##  $ date  : chr  "2020-01-21" "2020-01-22" "2020-01-23" "2020-01-24" ...
##  $ county: chr  "Snohomish" "Snohomish" "Snohomish" "Cook" ...
##  $ state : chr  "Washington" "Washington" "Washington" "Illinois" ...
##  $ fips  : num  53061 53061 53061 17031 53061 ...
##  $ cases : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ deaths: num  0 0 0 0 0 0 0 0 0 0 ...
View(dataset)
## Number of distinct values in each categorical column ##
length(unique(dataset[["county"]]))
## [1] 1929
length(unique(dataset[["date"]]))
## [1] 320
length(unique(dataset[["state"]]))
## [1] 55

Counting the number of missing (NA) values in each column

# For every column, print its name next to the number of NA entries it holds.
for (column_name in names(dataset)) {
  missing_count <- sum(is.na(dataset[[column_name]]))
  print(c(column_name, missing_count))
}
## [1] "date" "0"   
## [1] "county" "0"     
## [1] "state" "0"    
## [1] "fips" "7591"
## [1] "cases" "0"    
## [1] "deaths" "16733"

Removing rows that contain missing values

# Drop every row that has an NA in any column. The NA counts printed above
# show missing values only in `fips` and `deaths`, so rows that merely lack a
# `fips` code are discarded here as well.
# NOTE(review): `fips` is not used by any later step visible in this file --
# confirm that losing those rows' cases/deaths is intended.
dataset <- na.omit(dataset) 

Number of rows remaining after removing the rows with missing values

# Row count left after the NA filtering step.
dim(dataset)[1L]
## [1] 776113

Grouping the data by state and summing the deaths column within each state

# Per-state death total: add up the `deaths` column over every row (all
# counties, all dates) that belongs to the state.
# NOTE(review): the data is at daily level; if `deaths` is a running total
# rather than a daily increment, this sum counts each death once per day it
# appears -- confirm against the data source.
newdata <- dataset %>% group_by(state)
DeathsStates <- newdata %>%
  summarize(TotalDeaths = sum(deaths, na.rm = TRUE))

DeathsStates
## # A tibble: 53 x 2
##    state                TotalDeaths
##    <chr>                      <dbl>
##  1 Alabama                   413394
##  2 Alaska                      8638
##  3 Arizona                   845256
##  4 Arkansas                  203412
##  5 California               2479209
##  6 Colorado                  432318
##  7 Connecticut               974624
##  8 Delaware                  123490
##  9 District of Columbia      127280
## 10 Florida                  2101347
## # ... with 43 more rows
View(DeathsStates)

Grouping the data by state and summing the cases column within each state

# Per-state case total: sum the `cases` column over every row of the state.
newdata <- dataset %>% group_by(state)
CasesStates <- newdata %>%
  summarize(TotalCases = sum(cases, na.rm = TRUE))

View(CasesStates)
CasesStates
## # A tibble: 53 x 2
##    state                TotalCases
##    <chr>                     <int>
##  1 Alabama                24024159
##  2 Alaska                  1732614
##  3 Arizona                35564131
##  4 Arkansas               12982442
##  5 California            127488265
##  6 Colorado               15541584
##  7 Connecticut            12976009
##  8 Delaware                3841367
##  9 District of Columbia    2965627
## 10 Florida               107097126
## # ... with 43 more rows

Merging the two summaries above by state, so that the resulting data frame contains state, TotalCases and TotalDeaths

# Join the two per-state summaries on their only shared column, `state`,
# giving one row per state with both totals.
TotalDeathsandCasesStates <- merge(
  x = CasesStates,
  y = DeathsStates,
  by = "state"
)
TotalDeathsandCasesStates
##                       state TotalCases TotalDeaths
## 1                   Alabama   24024159      413394
## 2                    Alaska    1732614        8638
## 3                   Arizona   35564131      845256
## 4                  Arkansas   12982442      203412
## 5                California  127488265     2479209
## 6                  Colorado   15541584      432318
## 7               Connecticut   12976009      974624
## 8                  Delaware    3841367      123490
## 9      District of Columbia    2965627      127280
## 10                  Florida  107097126     2101347
## 11                  Georgia   47219809     1102543
## 12                   Hawaii    1609158       20057
## 13                    Idaho    7285734       76386
## 14                 Illinois   58791023     1759070
## 15                  Indiana   24135371      732228
## 16                     Iowa   16384096      246471
## 17                   Kansas   10723922      125276
## 18                 Kentucky   12581483      217679
## 19                Louisiana   27560697      973852
## 20                    Maine    1070475       28245
## 21                 Maryland   22726263      764945
## 22            Massachusetts   28741887     1917453
## 23                 Michigan   29322103     1529416
## 24                Minnesota   19697867      399032
## 25              Mississippi   15549961      459146
## 26                 Missouri   19493520      368377
## 27                  Montana    3039448       36557
## 28                 Nebraska    9094011       94108
## 29                   Nevada   13109245      248426
## 30            New Hampshire    1808423       86192
## 31               New Jersey   46122131     3345525
## 32               New Mexico    6288061      162113
## 33                 New York   48371010     2050253
## 34           North Carolina   34135408      574204
## 35             North Dakota    4365161       55013
## 36 Northern Mariana Islands       9994         290
## 37                     Ohio   29227779      855717
## 38                 Oklahoma   13916649      177146
## 39                   Oregon    5741679       94902
## 40             Pennsylvania   33208997     1648592
## 41             Rhode Island    4686126      209317
## 42           South Carolina   22343348      491539
## 43             South Dakota    4792546       52542
## 44                Tennessee   31128640      387269
## 45                    Texas  119483867     2261899
## 46                     Utah   13588710       81563
## 47                  Vermont     396976       13520
## 48           Virgin Islands     167365        2370
## 49                 Virginia   24999703      548547
## 50               Washington   16153040      421336
## 51            West Virginia    2855598       55763
## 52                Wisconsin   25129253      289315
## 53                  Wyoming    1527219       11733
# Open the merged per-state totals in the interactive data viewer.
View(TotalDeathsandCasesStates)

Interactive Barchart

# Interactive bar chart: one red bar per state, bar height = TotalDeaths.
fig <- DeathsStates %>%
  plot_ly(
    x = ~state,
    y = ~TotalDeaths,
    type = "bar",
    marker = list(color = "red")
  ) %>%
  layout(title = "State vs TotalDeaths")
fig
This chart shows the total deaths recorded for each state

Grouped Barchart

## Side-by-side bars per state for the two totals (cases and deaths), so the
## two quantities can be compared state by state.

## Case totals are far larger than death totals, so they are scaled down by a
## factor of 20 to keep both series readable on one axis.
newdata <- TotalDeathsandCasesStates %>%
  mutate(TotalCases = TotalCases / 20)

### Grouped bar chart ###
fig_bar <- newdata %>%
  plot_ly(x = ~state, y = ~TotalCases, type = "bar", name = "CasesTot") %>%
  add_trace(y = ~TotalDeaths, name = "DeathsTot") %>%
  layout(
    barmode = "group",
    annotations = list(
      list(text = "Number of cases is divided via 20")
    )
  )
fig_bar

Stacked Barchart

## Same two series as the grouped chart (scaled case totals and death totals
## from `newdata`), but stacked on top of each other within each state's bar.

fig_bar <- newdata %>%
  plot_ly(x = ~state, y = ~TotalCases, type = "bar", name = "CasesTot") %>%
  add_trace(y = ~TotalDeaths, name = "DeathsTot") %>%
  layout(
    barmode = "stack",
    annotations = list(text = "Number of cases is divided via 20")
  )
fig_bar

Line plot

# Linear-model trend line of TotalDeaths against TotalCases across states.
TotalDeathsandCasesStates %>%
  ggplot(
    aes(
      x = TotalCases,
      y = TotalDeaths,
      size = TotalCases,
      color = TotalDeaths
    )
  ) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

Scatter Plot

# One point per state: TotalCases on x, TotalDeaths on y; point size follows
# TotalCases and point colour follows TotalDeaths.

TotalDeathsandCasesStates %>%
  ggplot(
    aes(
      x = TotalCases,
      y = TotalDeaths,
      size = TotalCases,
      color = TotalDeaths
    )
  ) +
  geom_point(alpha = .7)

### Animating scatterplot

library(gganimate)
## Warning: package 'gganimate' was built under R version 4.1.3
library(gifski)
## Warning: package 'gifski' was built under R version 4.1.3
### Animated scatter plot: the TotalCases/TotalDeaths points are shown one
### `state` at a time and rendered to a GIF with gifski.

scatter_plot_animate <- TotalDeathsandCasesStates %>%
  ggplot(aes(TotalCases, TotalDeaths)) +
  geom_point() +
  transition_states(state)

animate(scatter_plot_animate, renderer = gifski_renderer())

Data containing total deaths on each date

# Per-date death total: sum `deaths` over every row sharing the same date.
newdata <- dataset %>% group_by(date)
DeathsdateSum <- newdata %>%
  summarize(TotalDeaths = sum(deaths, na.rm = TRUE))
DeathsdateSum
## # A tibble: 320 x 2
##    date       TotalDeaths
##    <chr>            <dbl>
##  1 2020-01-21           0
##  2 2020-01-22           0
##  3 2020-01-23           0
##  4 2020-01-24           0
##  5 2020-01-25           0
##  6 2020-01-26           0
##  7 2020-01-27           0
##  8 2020-01-28           0
##  9 2020-01-29           0
## 10 2020-01-30           0
## # ... with 310 more rows

Time Series

## Time series line chart of the per-date death totals in DeathsdateSum.
##
## `date` is stored as character ("YYYY-MM-DD"). Left that way, the chart
## emitted "Can't display both discrete & non-discrete data on same axis", so
## it is converted to Date for a continuous time axis, and x/y are supplied
## directly to plot_ly() so there is a single trace instead of an empty base
## trace plus an added one. `width` is passed to plot_ly() because specifying
## it in layout() is deprecated.
fig <- plot_ly(
  DeathsdateSum,
  x = ~as.Date(date),
  y = ~TotalDeaths,
  type = "scatter",
  mode = "lines",
  width = 900
) %>%
  layout(
    showlegend = FALSE,
    xaxis = list(
      title = "date", # keep the axis label from showing the as.Date() call
      zerolinecolor = "#ffff",
      zerolinewidth = 2,
      gridcolor = "ffff"
    ),
    yaxis = list(
      zerolinecolor = "#ffff",
      zerolinewidth = 2,
      gridcolor = "ffff"
    ),
    plot_bgcolor = "#e5ecf6"
  )
fig

Density plot

## Density estimate of the per-date TotalDeaths values ##
DeathsdateSum %>%
  ggplot(aes(x = TotalDeaths)) +
  geom_density(fill = "Cornflowerblue", alpha = 0.4) +
  labs(title = "Deaths distribution according to dates")

Time Series

### Per-date case total: sum `cases` over every row sharing the same date ###
newdata <- dataset %>% group_by(date)
CasesdateSum <- newdata %>%
  summarize(TotalCases = sum(cases, na.rm = TRUE))
CasesdateSum
## # A tibble: 320 x 2
##    date       TotalCases
##    <chr>           <int>
##  1 2020-01-21          1
##  2 2020-01-22          1
##  3 2020-01-23          1
##  4 2020-01-24          2
##  5 2020-01-25          3
##  6 2020-01-26          5
##  7 2020-01-27          5
##  8 2020-01-28          5
##  9 2020-01-29          5
## 10 2020-01-30          6
## # ... with 310 more rows
### Time Series ###

## Time series line chart of the per-date case totals in CasesdateSum.
##
## `date` is stored as character ("YYYY-MM-DD"). Left that way, the chart
## emitted "Can't display both discrete & non-discrete data on same axis", so
## it is converted to Date for a continuous time axis, and x/y are supplied
## directly to plot_ly() so there is a single trace instead of an empty base
## trace plus an added one. `width` is passed to plot_ly() because specifying
## it in layout() is deprecated.
fig <- plot_ly(
  CasesdateSum,
  x = ~as.Date(date),
  y = ~TotalCases,
  type = "scatter",
  mode = "lines",
  width = 900
) %>%
  layout(
    showlegend = FALSE,
    xaxis = list(
      title = "date", # keep the axis label from showing the as.Date() call
      zerolinecolor = "#ffff",
      zerolinewidth = 2,
      gridcolor = "ffff"
    ),
    yaxis = list(
      zerolinecolor = "#ffff",
      zerolinewidth = 2,
      gridcolor = "ffff"
    ),
    plot_bgcolor = "#e5ecf6"
  )
fig

Density plot

## Density estimate of the per-date TotalCases values ##
CasesdateSum %>%
  ggplot(aes(x = TotalCases)) +
  geom_density(fill = "Cornflowerblue", alpha = 0.4) +
  labs(title = "Cases each day frequency")

### Share of total deaths contributed by each state ###

### There are too many states for a readable chart, so states contributing
### 2% or less of the total are pooled into a single "Others" entry below.
print(TotalDeathsandCasesStates)
##                       state TotalCases TotalDeaths
## 1                   Alabama   24024159      413394
## 2                    Alaska    1732614        8638
## 3                   Arizona   35564131      845256
## 4                  Arkansas   12982442      203412
## 5                California  127488265     2479209
## 6                  Colorado   15541584      432318
## 7               Connecticut   12976009      974624
## 8                  Delaware    3841367      123490
## 9      District of Columbia    2965627      127280
## 10                  Florida  107097126     2101347
## 11                  Georgia   47219809     1102543
## 12                   Hawaii    1609158       20057
## 13                    Idaho    7285734       76386
## 14                 Illinois   58791023     1759070
## 15                  Indiana   24135371      732228
## 16                     Iowa   16384096      246471
## 17                   Kansas   10723922      125276
## 18                 Kentucky   12581483      217679
## 19                Louisiana   27560697      973852
## 20                    Maine    1070475       28245
## 21                 Maryland   22726263      764945
## 22            Massachusetts   28741887     1917453
## 23                 Michigan   29322103     1529416
## 24                Minnesota   19697867      399032
## 25              Mississippi   15549961      459146
## 26                 Missouri   19493520      368377
## 27                  Montana    3039448       36557
## 28                 Nebraska    9094011       94108
## 29                   Nevada   13109245      248426
## 30            New Hampshire    1808423       86192
## 31               New Jersey   46122131     3345525
## 32               New Mexico    6288061      162113
## 33                 New York   48371010     2050253
## 34           North Carolina   34135408      574204
## 35             North Dakota    4365161       55013
## 36 Northern Mariana Islands       9994         290
## 37                     Ohio   29227779      855717
## 38                 Oklahoma   13916649      177146
## 39                   Oregon    5741679       94902
## 40             Pennsylvania   33208997     1648592
## 41             Rhode Island    4686126      209317
## 42           South Carolina   22343348      491539
## 43             South Dakota    4792546       52542
## 44                Tennessee   31128640      387269
## 45                    Texas  119483867     2261899
## 46                     Utah   13588710       81563
## 47                  Vermont     396976       13520
## 48           Virgin Islands     167365        2370
## 49                 Virginia   24999703      548547
## 50               Washington   16153040      421336
## 51            West Virginia    2855598       55763
## 52                Wisconsin   25129253      289315
## 53                  Wyoming    1527219       11733
## States whose share of the overall TotalDeaths is greater than 2% keep
## their own row ##
newdata <- TotalDeathsandCasesStates %>%
  filter(TotalDeaths / sum(TotalDeaths) > 0.02) %>%
  select(state, TotalDeaths)

## Combined deaths of all remaining states (share of 2% or less) ##
sumOTHERdeaths <- sum(
  filter(
    TotalDeathsandCasesStates,
    TotalDeaths / sum(TotalDeaths) <= 0.02
  )$TotalDeaths
)

# Append the pooled row as a one-row data frame. Assigning
# c("Others", sumOTHERdeaths) would coerce the number to a string and turn
# the whole TotalDeaths column into character.
newdata <- rbind(
  newdata,
  data.frame(state = "Others", TotalDeaths = sumOTHERdeaths)
)
newdata
##            state TotalDeaths
## 1        Arizona      845256
## 2     California     2479209
## 3    Connecticut      974624
## 4        Florida     2101347
## 5        Georgia     1102543
## 6       Illinois     1759070
## 7        Indiana      732228
## 8      Louisiana      973852
## 9       Maryland      764945
## 10 Massachusetts     1917453
## 11      Michigan     1529416
## 12    New Jersey     3345525
## 13      New York     2050253
## 14          Ohio      855717
## 15  Pennsylvania     1648592
## 16         Texas     2261899
## 17        Others     7342966

Piechart

### Piechart ###
# One slice per row of `newdata`, sized by TotalDeaths and labelled by state.
# add_pie() reads its data from the plot_ly() object and sets the trace type
# itself, so the extra positional `newdata` and the repeated `type` argument
# were redundant and have been dropped.
piechart <- plot_ly(newdata) %>%
  add_pie(labels = ~state, values = ~TotalDeaths)

piechart

DonutChart

### Donut chart of TotalDeaths per state ###
# Same slices as a pie chart, with `hole = 0.5` cutting out the centre.
# add_pie() reads its data from the plot_ly() object and sets the trace type
# itself, so the extra positional `newdata` and the repeated `type` argument
# were redundant and have been dropped.
DonutChart <- plot_ly(newdata) %>%
  add_pie(labels = ~state, values = ~TotalDeaths, hole = 0.5)
DonutChart
### Share of total cases contributed by each state ###

### There are too many states for a readable chart, so states contributing
### 2% or less of the total are pooled into a single "Others" entry below.
print(TotalDeathsandCasesStates)
##                       state TotalCases TotalDeaths
## 1                   Alabama   24024159      413394
## 2                    Alaska    1732614        8638
## 3                   Arizona   35564131      845256
## 4                  Arkansas   12982442      203412
## 5                California  127488265     2479209
## 6                  Colorado   15541584      432318
## 7               Connecticut   12976009      974624
## 8                  Delaware    3841367      123490
## 9      District of Columbia    2965627      127280
## 10                  Florida  107097126     2101347
## 11                  Georgia   47219809     1102543
## 12                   Hawaii    1609158       20057
## 13                    Idaho    7285734       76386
## 14                 Illinois   58791023     1759070
## 15                  Indiana   24135371      732228
## 16                     Iowa   16384096      246471
## 17                   Kansas   10723922      125276
## 18                 Kentucky   12581483      217679
## 19                Louisiana   27560697      973852
## 20                    Maine    1070475       28245
## 21                 Maryland   22726263      764945
## 22            Massachusetts   28741887     1917453
## 23                 Michigan   29322103     1529416
## 24                Minnesota   19697867      399032
## 25              Mississippi   15549961      459146
## 26                 Missouri   19493520      368377
## 27                  Montana    3039448       36557
## 28                 Nebraska    9094011       94108
## 29                   Nevada   13109245      248426
## 30            New Hampshire    1808423       86192
## 31               New Jersey   46122131     3345525
## 32               New Mexico    6288061      162113
## 33                 New York   48371010     2050253
## 34           North Carolina   34135408      574204
## 35             North Dakota    4365161       55013
## 36 Northern Mariana Islands       9994         290
## 37                     Ohio   29227779      855717
## 38                 Oklahoma   13916649      177146
## 39                   Oregon    5741679       94902
## 40             Pennsylvania   33208997     1648592
## 41             Rhode Island    4686126      209317
## 42           South Carolina   22343348      491539
## 43             South Dakota    4792546       52542
## 44                Tennessee   31128640      387269
## 45                    Texas  119483867     2261899
## 46                     Utah   13588710       81563
## 47                  Vermont     396976       13520
## 48           Virgin Islands     167365        2370
## 49                 Virginia   24999703      548547
## 50               Washington   16153040      421336
## 51            West Virginia    2855598       55763
## 52                Wisconsin   25129253      289315
## 53                  Wyoming    1527219       11733
## States whose share of the overall TotalCases is greater than 2% keep
## their own row ##
newdata <- TotalDeathsandCasesStates %>%
  filter(TotalCases / sum(TotalCases) > 0.02) %>%
  select(state, TotalCases)

## Combined cases of all remaining states (share of 2% or less) ##
# The complement must be taken on the TotalCases share. Filtering on the
# TotalDeaths share (as before) pooled a different set of states, so some
# states were counted both individually and inside "Others" while others
# were left out entirely.
sumOTHERCases <- sum(
  filter(
    TotalDeathsandCasesStates,
    TotalCases / sum(TotalCases) <= 0.02
  )$TotalCases
)

# Append the pooled row as a one-row data frame. Assigning
# c("Others", sumOTHERCases) would coerce the number to a string and turn
# the whole TotalCases column into character.
newdata <- rbind(
  newdata,
  data.frame(state = "Others", TotalCases = sumOTHERCases)
)
newdata
##             state TotalCases
## 1         Arizona   35564131
## 2      California  127488265
## 3         Florida  107097126
## 4         Georgia   47219809
## 5        Illinois   58791023
## 6         Indiana   24135371
## 7       Louisiana   27560697
## 8   Massachusetts   28741887
## 9        Michigan   29322103
## 10     New Jersey   46122131
## 11       New York   48371010
## 12 North Carolina   34135408
## 13           Ohio   29227779
## 14   Pennsylvania   33208997
## 15      Tennessee   31128640
## 16          Texas  119483867
## 17       Virginia   24999703
## 18      Wisconsin   25129253
## 19         Others  325069880

Piechart

### Pie chart of TotalCases per state ###
# One slice per row of `newdata`, sized by TotalCases and labelled by state.
# add_pie() reads its data from the plot_ly() object and sets the trace type
# itself, so the extra positional `newdata` and the repeated `type` argument
# were redundant and have been dropped.
piechart <- plot_ly(newdata) %>%
  add_pie(labels = ~state, values = ~TotalCases)

piechart

Donut Chart

# Donut chart of TotalCases per state: same slices as a pie chart, with
# `hole = 0.5` cutting out the centre. add_pie() reads its data from the
# plot_ly() object and sets the trace type itself, so the extra positional
# `newdata` and the repeated `type` argument were redundant and are dropped.
DonutChart <- plot_ly(newdata) %>%
  add_pie(labels = ~state, values = ~TotalCases, hole = 0.5)
DonutChart
## Box plot of the per-state TotalDeaths values, drawn against TotalCases.
# `TotalCases` is continuous, so geom_boxplot() cannot derive groups from x
# and warned "Continuous x aesthetic -- did you forget aes(group=...)?".
# `group = 1` states explicitly that all states form one box, which is how
# the ungrouped data was being drawn.
# NOTE(review): if one box per range of TotalCases was intended, use a binned
# grouping instead -- confirm the intent.
ggplot(
  TotalDeathsandCasesStates,
  aes(x = TotalCases, y = TotalDeaths, group = 1)
) +
  geom_boxplot(color = "blue") +
  labs(x = "TotalCases", y = "TotalDeaths") +
  ggtitle("box plot of Total Cases vs TotalDeaths ")

# Open the merged per-state totals in the interactive data viewer.
View(TotalDeathsandCasesStates)
## Box plot with a violin overlay of the per-state TotalDeaths values, drawn
## against TotalCases.
# `TotalCases` is continuous, so no groups can be derived from x and ggplot2
# warned "Continuous x aesthetic -- did you forget aes(group=...)?".
# `group = 1` states explicitly that all states form a single group, which is
# how the ungrouped data was being drawn.
ggplot(
  TotalDeathsandCasesStates,
  aes(x = TotalCases, y = TotalDeaths, group = 1)
) +
  geom_boxplot(width = 0.3, color = "yellow") +
  geom_violin(width = 2, alpha = 0.5, color = "red") +
  labs(x = "Total Cases", y = "Total Deaths") +
  ggtitle("Total Cases vs Total Deaths")

# Read zomato.csv into ZomatoData and open it in the interactive data viewer.
# The path is relative, so it is resolved against the current working
# directory (set near the top of this file with setwd()).
ZomatoData <- read.csv("zomato.csv")
View(ZomatoData)